# imports
from os.path import join
import pandas as pd
from sklearn.model_selection import train_test_split
from discover_ import Discover
from discover.utils.Timer import Timer
# When True, only the first 110 grouped rows are used below (100 for training,
# 10 for validation); the flag is also forwarded to Discover.
dummy_run = False
disc = Discover(dummy_run=dummy_run)

# load validation data
# HACK: path is hard-coded relative to the current working directory while
# still working out dependency structure
data_dir = join("CrabNet", "data", "materials_data", "elasticity")
# alternatives: "example_materials_property_val_output.csv",
# "elasticity_val_output.csv"
name = "train.csv"
fpath = join(data_dir, name)
df = pd.read_csv(fpath)

# if there are two compounds with the same formula, we're more interested in
# the higher GPa
group_filter = "max"  # "mean"
# One row per formula: "index" collects the original row numbers of every
# duplicate as a tuple, "target" is reduced with `group_filter`.
grp_df = (
    df.reset_index()
    .groupby(by="formula")
    .agg({"index": lambda x: tuple(x), "target": group_filter})
    .reset_index()
)

# REVIEW: drop pure elements here?
# take small subset
if dummy_run:
    n = 100
    n2 = 10
    train_df = grp_df.iloc[:n, :]
    val_df = grp_df.iloc[n : n + n2, :]
else:
    # REVIEW: consider changing train_size to 0.2
    train_df, val_df = train_test_split(grp_df, train_size=0.8)

# Hand independent frames downstream rather than slices of grp_df.
# NOTE(review): presumably this is what triggers the SettingWithCopyWarning
# emitted during fit/predict -- confirm against Discover's implementation.
train_df = train_df.copy()
val_df = val_df.copy()

# slower if umap_random_state is not None
with Timer("DISCOVER-fit"):
    disc.fit(train_df)
C:\Users\sterg\Anaconda3\envs\elm2d-crabnet\lib\site-packages\pandas\core\indexing.py:1667: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy self.obj[key] = value
Model architecture: out_dims, d_model, N, heads 3, 512, 3, 4 Running on compute device: cuda:0 Model size: 11987206 parameters
Generating EDM: 100%|██████████| 8572/8572 [00:00<00:00, 156165.76formulae/s]
loading data with up to 6 elements in the formula training with batchsize 512 (2**9.000) stepping every 170 training passes, cycling lr every 10 epochs checkin at 20 epochs to match lr scheduler Epoch: 0/40 --- train mae: 53 val mae: 53 Epoch: 19/40 --- train mae: 12.7 val mae: 12.7 Epoch: 39/40 --- train mae: 9.64 val mae: 9.64 Saving network (test-property) to models/trained_models/test-property.pth [train-CrabNet] Elapsed: 89.82874 [DISCOVER-fit] Elapsed: 89.82974
# Run prediction on the validation split; the result is kept in `score` and
# the elapsed time is reported under the "DISCOVER-predict" label.
with Timer("DISCOVER-predict"):
    score = disc.predict(val_df)
C:\Users\sterg\Anaconda3\envs\elm2d-crabnet\lib\site-packages\pandas\core\indexing.py:1773: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy self._setitem_single_column(ilocs[0], value, pi) Generating EDM: 100%|██████████| 8572/8572 [00:00<00:00, 182869.33formulae/s]
loading data with up to 6 elements in the formula
C:\Users\sterg\Anaconda3\envs\elm2d-crabnet\lib\site-packages\pandas\core\indexing.py:1667: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy self.obj[key] = value Generating EDM: 100%|██████████| 2143/2143 [00:00<00:00, 165285.55formulae/s]
loading data with up to 6 elements in the formula Fitting mod_petti kernel matrix Constructing distances [fit-wasserstein] Elapsed: 17.19802
C:\Users\sterg\Anaconda3\envs\elm2d-crabnet\lib\site-packages\umap\umap_.py:1735: UserWarning: using precomputed metric; transform will be unavailable for new data and inverse_transform will be unavailable for all data warn(
[fit-UMAP] Elapsed: 45.07268
C:\Users\sterg\Anaconda3\envs\elm2d-crabnet\lib\site-packages\umap\umap_.py:1735: UserWarning: using precomputed metric; transform will be unavailable for new data and inverse_transform will be unavailable for all data warn(
[fit-vis-UMAP] Elapsed: 17.08833 [HDBSCAN*] Elapsed: 0.14162 [pdf-summation] Elapsed: 10.41885 [train-val-pdf-summation] Elapsed: 1.89099 [DISCOVER-predict] Elapsed: 96.64376
# Produce the DISCOVER figures; the elapsed time is reported under the
# "DISCOVER-plot" label.
with Timer("DISCOVER-plot"):
    disc.plot()
C:\Users\sterg\Anaconda3\envs\elm2d-crabnet\lib\site-packages\plotly\graph_objs\_deprecations.py:378: DeprecationWarning: plotly.graph_objs.Line is deprecated. Please replace it with one of the following more specific types - plotly.graph_objs.scatter.Line - plotly.graph_objs.layout.shape.Line - etc.
[DISCOVER-plot] Elapsed: 6.43417